From 28342786268f2d4af5a54cc64c7b3ccb8e442d22 Mon Sep 17 00:00:00 2001 From: feger <marc.feger@hhu.de> Date: Sun, 14 Apr 2019 16:46:28 +0200 Subject: [PATCH] Add A2.2 and add more doc-strings --- Aufgabe2/FileReader.py | 10 +++ Aufgabe2/FileWriter.py | 17 ++++++ Aufgabe2/files/output.documents | 1 + Aufgabe2/files/output.query | 1 + Aufgabe2/program.py | 39 +++++++++--- Aufgabe2/rocchio.py | 105 ++++++++++++++++++++++++++++++++ 6 files changed, 163 insertions(+), 10 deletions(-) create mode 100644 Aufgabe2/FileWriter.py create mode 100644 Aufgabe2/files/output.documents create mode 100644 Aufgabe2/files/output.query create mode 100644 Aufgabe2/rocchio.py diff --git a/Aufgabe2/FileReader.py b/Aufgabe2/FileReader.py index 3c9c27b..9a970af 100644 --- a/Aufgabe2/FileReader.py +++ b/Aufgabe2/FileReader.py @@ -1,3 +1,4 @@ +import ast from typing import List, Dict @@ -24,3 +25,12 @@ class FileReader(object): "text": line[1] }) return file_as_list + + def read_document_and_eval_type_of_content(self): + """ + Reads the whole file and evaluates its content as a Python literal using ast.literal_eval. + + :return: The evaluated content of the file (e.g. a list of dicts or a dict). + """ + with open(self.filename) as f: + return ast.literal_eval(f.read()) diff --git a/Aufgabe2/FileWriter.py b/Aufgabe2/FileWriter.py new file mode 100644 index 0000000..d1e979d --- /dev/null +++ b/Aufgabe2/FileWriter.py @@ -0,0 +1,17 @@ +import json + + +class FileWriter(object): + + def __init__(self, filename: str, documents): + self.filename: str = filename + self.documents = documents + + def write_to_file(self): + """ + Writes the documents to the file as JSON. 
+ + :return: None + """ + with open(self.filename, 'w') as file: + json.dump(self.documents, file) diff --git a/Aufgabe2/files/output.documents b/Aufgabe2/files/output.documents new file mode 100644 index 0000000..843185f --- /dev/null +++ b/Aufgabe2/files/output.documents @@ -0,0 +1 @@ +[{"filename": "1", "text": "drei Raben auf einem Baum", "filtered_text": ["Baum", "Raben", "drei"], "filtered_stemmed_text": ["baum", "rab", "drei"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 1, "beim": 0, "drei": 1, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 1, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0.3333333333333333, "beim": 0, "drei": 0.3333333333333333, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0.3333333333333333, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0.3656366710026855, "beim": 0, "drei": 0.4659800028906792, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0.4659800028906792, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "cosine": 0.0}, {"filename": "2", "text": "Tiger in Indien", "filtered_text": ["Indien", "Tiger"], "filtered_stemmed_text": ["indi", "tig"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 1, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, 
"sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0.5, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.5, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0.6989700043360189, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.309894379144197, "wildbahn": 0, "wood": 0}, "cosine": 0.101}, {"filename": "3", "text": "Sonnenblummenfelder im Sommer", "filtered_text": ["Sommer", "Sonnenblummenfelder"], "filtered_stemmed_text": ["somm", "sonnenblummenfeld"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 1, "sonnenblummenfeld": 1, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0.5, "sonnenblummenfeld": 0.5, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0.6989700043360189, "sonnenblummenfeld": 
0.6989700043360189, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "cosine": 0.0}, {"filename": "4", "text": "s\u00fc\u00dfe kleine Tiger spielen an Baum", "filtered_text": ["Baum", "Tiger", "kleine", "spielen", "s\u00fc\u00dfe"], "filtered_stemmed_text": ["baum", "tig", "klein", "spiel", "suss"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 1, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 1, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 1, "suss": 1, "tig": 1, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0.2, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0.2, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0.2, "suss": 0.2, "tig": 0.2, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0.2193820026016113, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0.27958800173440757, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0.27958800173440757, "suss": 0.27958800173440757, "tig": 0.1239577516576788, "wildbahn": 0, "wood": 0}, "cosine": 0.056}, {"filename": "5", "text": "Mann von Tigern angegriffen", "filtered_text": ["Mann", "Tigern", "angegriffen"], "filtered_stemmed_text": ["mann", "tig", "angegriff"], "counted_filtered_stemmed_text": {"angegriff": 1, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 1, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 0, "wood": 0}, 
"relative_filtered_stemmed_text": {"angegriff": 0.3333333333333333, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0.3333333333333333, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.3333333333333333, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0.4659800028906792, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0.4659800028906792, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.20659625276279797, "wildbahn": 0, "wood": 0}, "cosine": 0.074}, {"filename": "6", "text": "Tiger II Panzer", "filtered_text": ["II", "Panzer", "Tiger"], "filtered_stemmed_text": ["ii", "panz", "tig"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 1, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 1, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0.3333333333333333, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0.3333333333333333, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.3333333333333333, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0.4659800028906792, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0.4659800028906792, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 
0, "spiel": 0, "suss": 0, "tig": 0.20659625276279797, "wildbahn": 0, "wood": 0}, "cosine": 0.074}, {"filename": "7", "text": "Apfelkuchen mit Sahne", "filtered_text": ["Apfelkuchen", "Sahne"], "filtered_stemmed_text": ["apfelkuch", "sahn"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 1, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 1, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0.5, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0.5, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0.6989700043360189, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0.6989700043360189, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "cosine": 0.0}, {"filename": "8", "text": "der gestiefelte Kater Kost\u00fcm", "filtered_text": ["Kater", "Kost\u00fcm", "gestiefelte"], "filtered_stemmed_text": ["kat", "kostum", "gestiefelt"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 1, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 1, "klein": 0, "kostum": 1, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 
0.3333333333333333, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0.3333333333333333, "klein": 0, "kostum": 0.3333333333333333, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0.4659800028906792, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0.4659800028906792, "klein": 0, "kostum": 0.4659800028906792, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "cosine": 0.0}, {"filename": "9", "text": "Tiger Woods beim Golfen in Kalifornien", "filtered_text": ["Golfen", "Kalifornien", "Tiger", "Woods", "beim"], "filtered_stemmed_text": ["golf", "kaliforni", "tig", "wood", "beim"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 1, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 1, "ii": 0, "indi": 0, "kaliforni": 1, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 0, "wood": 1}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0.2, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0.2, "ii": 0, "indi": 0, "kaliforni": 0.2, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.2, "wildbahn": 0, "wood": 0.2}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0.27958800173440757, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0.27958800173440757, "ii": 0, "indi": 0, "kaliforni": 0.27958800173440757, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.1239577516576788, "wildbahn": 0, "wood": 
0.27958800173440757}, "cosine": 0.327}, {"filename": "10", "text": "Tiger in freier Wildbahn", "filtered_text": ["Tiger", "Wildbahn", "freier"], "filtered_stemmed_text": ["tig", "wildbahn", "freier"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 1, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 1, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0.3333333333333333, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.3333333333333333, "wildbahn": 0.3333333333333333, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0.4659800028906792, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.20659625276279797, "wildbahn": 0.4659800028906792, "wood": 0}, "cosine": 0.829}] \ No newline at end of file diff --git a/Aufgabe2/files/output.query b/Aufgabe2/files/output.query new file mode 100644 index 0000000..00a3f07 --- /dev/null +++ b/Aufgabe2/files/output.query @@ -0,0 +1 @@ +{"text": "Tiger Woods in freier Wildbahn", "filtered_query": ["Tiger", "Wildbahn", "Woods", "freier"], "filtered_stemmed_query": ["tig", "wildbahn", "wood", "freier"], "counted_filtered_stemmed_query": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 1, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, 
"sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 1, "wood": 1}, "relative_filtered_stemmed_query": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0.25, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.25, "wildbahn": 0.25, "wood": 0.25}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0.34948500216800943, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.1549471895720985, "wildbahn": 0.34948500216800943, "wood": 0.34948500216800943}} \ No newline at end of file diff --git a/Aufgabe2/program.py b/Aufgabe2/program.py index cfe71a9..34d48a6 100644 --- a/Aufgabe2/program.py +++ b/Aufgabe2/program.py @@ -9,6 +9,7 @@ import numpy as np from nltk.corpus import stopwords from FileReader import FileReader +from FileWriter import FileWriter EXAMPLE = False LANGUAGE = 'german' @@ -80,7 +81,8 @@ def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example relative_counted_document_vector[term] = counted_term.get(term) else: # Maybe the relative count of all terms per document and not all terms in the vocabulary ? 
- relative_counted_document_vector[term] = counted_term.get(term) / len(vocabulary) + # Maybe len(vocabulary) + relative_counted_document_vector[term] = counted_term.get(term) / len(element.get(key)) element['counted_' + key]: Dict = counted_document_vector element['relative_' + key]: Dict = relative_counted_document_vector @@ -152,32 +154,46 @@ def cosine(v1: List, v2: List): return counter / divider -def add_cosine_similarity(documents: List[Dict], query: Dict) -> None: +def add_cosine_similarity(documents: List[Dict], query: Dict, tag='') -> None: """ Adds the cosine-similarity between a document and the query. + :param tag: This can be used to add a new cosine similarity with a new document vector of the query. :param documents: A list of documents. :param query: The query to be looked for. :return: None """ - v2: List = [value for _, value in query.get('document_vector').items()] + + if tag != '': + tag = '_' + tag + + v2: List = [value for _, value in query.get('document_vector' + tag).items()] for document in documents: v1: List = [value for _, value in document.get('document_vector').items()] - document['cosine']: Double = round(cosine(v1, v2), 3) + document['cosine' + tag]: Double = round(cosine(v1, v2), 3) -def print_results(documents: List[Dict]) -> None: +def print_results(documents: List[Dict], query: Dict, tag='') -> None: """ Pretty print the documents. + :param query: The search query. + :param tag: This can be used to use a new cosine similarity with a new document vector of the query. 
:param documents: List of documents :return: None """ + + if tag != '': + tag = '_' + tag + print('###############################################################') - documents: List[Dict] = sorted(documents, key=lambda k: k['cosine'], reverse=True) + documents: List[Dict] = sorted(documents, key=lambda k: k['cosine' + tag], reverse=True) top5_documents: List[Dict] = documents[:5] + print('Query: ' + query.get('text')) for document in top5_documents: - print('Document: ' + document.get('filename') + ' has a cosine similarity of: ' + str(document.get('cosine'))) + print('Document: ' + document.get('filename') + ', Text: ' + document.get( + 'text') + ', has a cosine similarity of: ' + str( + round(document.get('cosine' + tag), 3))) print('###############################################################') @@ -190,10 +206,10 @@ if __name__ == '__main__': To run the example of A1: -> Change example to True - run $ python program.py meta_a1.txt "Tasse Kanne" + run $ python program.py ./files/meta_a1.txt "Tasse Kanne" Else -> Make sure example is False. 
- run $ python program.py meta.txt "<query>" + run $ python program.py ./files/meta.txt "<query>" :arg 1: File which contains all documents (Two columns separated with \t and ends with \n) :arg 2: Query to be looked for (must be surrounded with "") @@ -224,4 +240,7 @@ if __name__ == '__main__': # print('All Documents: ' + json.dumps(all_documents, indent=3)) # print('Search Query: ' + json.dumps(search_query, indent=3)) # - print_results(all_documents) + print_results(all_documents, search_query) + + FileWriter('./files/output.documents', all_documents).write_to_file() + FileWriter('./files/output.query', search_query).write_to_file() diff --git a/Aufgabe2/rocchio.py b/Aufgabe2/rocchio.py new file mode 100644 index 0000000..7f8e7e9 --- /dev/null +++ b/Aufgabe2/rocchio.py @@ -0,0 +1,105 @@ +from tokenize import Double +from typing import List, Dict, Tuple + +import numpy as np + +from FileReader import FileReader +from program import print_results, add_cosine_similarity + + +def assemble_dr_dnr(feedback: List, documents: List[Dict]) -> Tuple[List, List]: + """ + This function assembles the relevant (Dr) and non-relevant (Dnr) document sets for Rocchio. + It is assumed that all elements in Dr are taken from the top-5 elements and that Dnr contains all other elements. + Therefore it is assumed that: Dr U Dnr = D (all documents) + + :param feedback: List of relevant document names + :param documents: List of all documents in the system + :return: Dr, Dnr + """ + + unique_documents = list(set([int(document.get('filename')) for document in documents])) + dr = list(set([int(d) for d in unique_documents if d in feedback])) + dnr = list(set([int(d) for d in unique_documents if d not in dr])) + + return dr, dnr + + +def dict_to_list(target: Dict) -> List: + """ + Turns a dict into a list of its values. + + :param target: Dict to be turned into a list. 
+ :return: A list containing all values of the dict + """ + return [value for _, value in target.items()] + + +def q_m(query: Dict, documents: List[Dict], feedback: List, alpha: Double, beta: Double, gamma: Double): + """ + Calculates the new query vector as defined by Rocchio and stores it in query['document_vector_new']. + + :param query: The search query + :param documents: The list of all documents + :param feedback: The list with the relevant document names + :param alpha: Weight of the original query vector + :param beta: Weight of the mean vector of the relevant documents + :param gamma: Weight of the mean vector of the non-relevant documents (this term is subtracted) + :return: None + """ + relevant_documents, not_relevant_documents = assemble_dr_dnr(feedback, documents) + + addend_1: List = list(np.multiply(alpha, dict_to_list(query.get('document_vector')))) + + addend_2: List = list(np.zeros(len(query.get('document_vector')))) + if len(relevant_documents) != 0: + for document in documents: + if int(document.get('filename')) in relevant_documents: + addend_2 = list(np.add(addend_2, dict_to_list(document.get('document_vector')))) + + addend_2 = list(np.multiply((beta / len(relevant_documents)), addend_2)) + + addend_3: List = list(np.zeros(len(query.get('document_vector')))) + if len(not_relevant_documents) != 0: + for document in documents: + if int(document.get('filename')) in not_relevant_documents: + addend_3 = list(np.add(addend_3, dict_to_list(document.get('document_vector')))) + addend_3 = list(np.multiply((gamma / len(not_relevant_documents)), addend_3)) + + qm_tmp = list(np.add(addend_1, addend_2)) + qm_tmp = list(np.subtract(qm_tmp, addend_3)) + + qm = dict(list(zip(query.get('document_vector').keys(), qm_tmp))) + + query['document_vector_new'] = qm + + +if __name__ == '__main__': + """ + This program performs a one-step Rocchio improvement of the search query. + The values for the search query along with the values for the documents are taken from the corresponding + files in /files. Then the user is asked to give their feedback. + Note: Enter only valid document ids which are also in the top 5. 
+ + Before running this code make sure you have generated all files in /files. + To do so run e.g: + $ python program.py ./files/meta.txt "Tiger Woods in freier Wildbahn" + To run the code use: + $ python rocchio.py + """ + all_documents = FileReader('./files/output.documents').read_document_and_eval_type_of_content() + search_query = FileReader('./files/output.query').read_document_and_eval_type_of_content() + + print('Top-5 results:') + print_results(all_documents, search_query) + print('Attention: Input must be int! And please use only documents in the Top-5!') + + input_of_relevant_documents = [int(x) for x in + input("Please enter the names of the documents you find relevant: ").split()] + + q_m(search_query, all_documents, input_of_relevant_documents, 1, 0.8, 0.1) + + add_cosine_similarity(all_documents, search_query, 'new') + + print('Top-5 results (new):') + print_results(all_documents, search_query, 'new') -- GitLab