Commit 4f9b8467 authored by Marc Feger

Add A2.1

.gitignore

.idea/
__pycache__
FileReader.py

from typing import List, Dict


class FileReader(object):
    def __init__(self, filename: str):
        self.filename = filename

    def return_file_as_list_of_documents(self, skip_first_line: bool = False) -> List[Dict]:
        """
        This method reads the file and returns it as a list of dicts.
        :param skip_first_line: If True, the header line of the file is skipped.
        :return: The file as a list of dicts, each with a filename and a text.
        """
file_as_list = []
with open(self.filename) as file:
if skip_first_line:
next(file)
for line in file:
line = line.replace('\n', '')
line = line.split('\t')
file_as_list.append({
"filename": line[0],
"text": line[1]
})
return file_as_list
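
# A minimal usage sketch (hedged: assumes meta.txt from this commit lies next
# to this file; the expected format is one header line followed by
# tab-separated "<filename>\t<text>" rows):
if __name__ == '__main__':
    reader = FileReader(filename='meta.txt')
    documents = reader.return_file_as_list_of_documents(skip_first_line=True)
    print(documents[0])  # e.g. {'filename': '1', 'text': 'drei Raben auf einem Baum'}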
meta.txt

Bildname Bildbeschreibung
1 drei Raben auf einem Baum
2 Tiger in Indien
3 Sonnenblummenfelder im Sommer
4 süße kleine Tiger spielen an Baum
5 Mann von Tigern angegriffen
6 Tiger II Panzer
7 Apfelkuchen mit Sahne
8 der gestiefelte Kater Kostüm
9 Tiger Woods beim Golfen in Kalifornien
10 Tiger in freier Wildbahn
meta_a1.txt

Bildname Bildbeschreibung
1 Kaffee Kaffee
2 Tee Tee Tasse Kanne Kanne
3 Kaffee Tasse Tasse Kanne
4 Kaffee Kaffee Kaffee Tee Tasse Tasse Tasse Kanne Kanne Kanne
5 Kanne Kanne Wasser Wasser
program.py

import copy
import sys
from collections import Counter
from typing import List, Dict
import nltk
import numpy as np
from nltk.corpus import stopwords
from FileReader import FileReader
EXAMPLE = False
LANGUAGE = 'german'
STOP_WORDS = stopwords.words(LANGUAGE)
STEMMER = nltk.SnowballStemmer(LANGUAGE)
def prepare_documents(documents: List[Dict]) -> None:
"""
    This method adds the filtered and the filtered-stemmed version of the document's text to the corresponding document.
:param documents: List of all documents.
:return: None
"""
for document in documents:
text_tokens = nltk.word_tokenize(document.get('text'), LANGUAGE)
document['filtered_text'] = sorted([w for w in text_tokens if w not in STOP_WORDS])
        document['filtered_stemmed_text'] = [STEMMER.stem(w) for w in document['filtered_text']]
def prepare_query(query: Dict) -> None:
"""
    This method prepares the query by adding the filtered and filtered-stemmed version to the query.
:param query: Dict with the query information.
:return: None
"""
text_tokens = nltk.word_tokenize(query.get('text'), LANGUAGE)
query['filtered_query'] = sorted([w for w in text_tokens if w not in STOP_WORDS])
    query['filtered_stemmed_query'] = [STEMMER.stem(w) for w in query['filtered_query']]
def assemble_index_term_vocabulary_from_(documents: List[Dict], query: Dict) -> List:
"""
    This method concatenates all filtered and stemmed texts of all documents and the search query.
    This is done because those filtered and stemmed words form the index-term-vocabulary.
:param documents: List of all documents.
:param query: Dict of the query.
:return: The index-term-vocabulary.
"""
vocabulary = []
for document in documents:
vocabulary += document.get('filtered_stemmed_text')
vocabulary += query.get('filtered_stemmed_query')
return sorted(set(vocabulary))
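# Illustration (hedged, using meta_a1.txt from this commit): documents 1-5
# contain the distinct raw terms Kaffee, Tee, Tasse, Kanne and Wasser, and the
# query "Tasse Kanne" adds no new ones, so the index-term-vocabulary consists
# of the five corresponding Snowball stems.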
def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example: bool = EXAMPLE) -> None:
"""
This method adds the counted version of the document vector to a given element as described in Chapter 1. Foil 33(3)
:param example: If True the calculation is done like A1.
:param target: A target list of dictionaries which contains the data to be changed.
:param vocabulary: The index term vocabulary.
:param key: Name of the key which should be used to generate the document vector.
    :return: None
"""
for element in target:
counted_document_vector = {}
relative_counted_document_vector = {}
counted_term = dict(Counter(element.get(key)))
for term in vocabulary:
counted_document_vector[term] = 0
relative_counted_document_vector[term] = 0
if term in counted_term.keys():
counted_document_vector[term] = counted_term.get(term)
if example:
relative_counted_document_vector[term] = counted_term.get(term)
else:
                    # Open question: should this be relative to the number of terms in the document rather than the vocabulary size?
relative_counted_document_vector[term] = counted_term.get(term) / len(vocabulary)
element['counted_' + key] = counted_document_vector
element['relative_' + key] = relative_counted_document_vector
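# Illustration (hedged, meta_a1.txt): for document 1 ("Kaffee Kaffee") the
# counted vector holds 2 for the stem of "Kaffee" and 0 for the other four
# vocabulary terms; with EXAMPLE = False the relative vector divides by the
# vocabulary size, giving 2 / 5 = 0.4 for that entry.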
def get_df(documents: List[Dict], vocabulary: List) -> Dict:
"""
This method creates the df used in tf-idf.
It stores the amount of each term over all documents.
:param documents: All documents
:param vocabulary: The index term vocabulary
:return: df
"""
df = {}
for term in vocabulary:
df[term] = 0
for document in documents:
for term, count in document.get('counted_filtered_stemmed_text').items():
df[term] += (count != 0) # count existence in documents
return df
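# Illustration (hedged, meta_a1.txt): "Kanne" occurs in documents 2, 3, 4 and
# 5, so the df of its stem is 4; "Kaffee" occurs in documents 1, 3 and 4, so
# its stem gets df = 3. Only presence per document counts, not frequency.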
def add_tf_idf_as_weight(target: List[Dict], df: Dict, key: str, example=EXAMPLE) -> None:
"""
    This method adds the tf-idf as weight to the document vector as described in Chapter 1. Foil 33(4).
:param target: A list of targets to be weighted with tf-idf.
:param df: df to use in idf.
:param key: Is the target a document or query?
:param example: To run example of A1.
:return: None
"""
for element in target:
document_vector = copy.deepcopy(element.get('relative_filtered_stemmed_' + key))
for term in element.get('filtered_stemmed_' + key):
if df.get(term) == 0:
document_vector[term] = 0
else:
                # Be careful with log10: len(df.keys()) is the vocabulary size here,
                # whereas classic tf-idf uses the number of documents in the idf numerator.
if example:
document_vector[term] = (element.get('counted_filtered_stemmed_' + key)[term] *
np.log10(len(df.keys()) / df.get(term)))
else:
document_vector[term] = element.get('relative_filtered_stemmed_' + key)[term] * \
(element.get('counted_filtered_stemmed_' + key)[term] *
np.log10(len(df.keys()) / df.get(term)))
element['document_vector'] = document_vector
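# Worked sketch (hedged) for the example branch with meta_a1.txt: the
# vocabulary has 5 terms, so for document 4 and the stem of "Kanne"
# (tf = 3, df = 4) the weight is 3 * log10(5 / 4) ~ 3 * 0.0969 ~ 0.29.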
def cosine(v1: List, v2: List) -> float:
"""
Calculates the cosine-similarity between two vectors.
:param v1: Vector 1.
:param v2: Vector 2.
:return: Cosine-Similarity of v1 and v2.
"""
    v1 = np.array(v1)
    v2 = np.array(v2)
    numerator = np.dot(v1, v2)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0
    return numerator / denominator
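# Quick sanity checks:
#   cosine([1.0, 0.0], [1.0, 0.0]) -> 1.0   (identical direction)
#   cosine([1.0, 0.0], [0.0, 1.0]) -> 0.0   (orthogonal)
#   cosine([0.0, 0.0], [1.0, 1.0]) -> 0     (guarded zero-norm case)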
def add_cosine_similarity(documents: List[Dict], query: Dict) -> None:
"""
Adds the cosine-similarity between a document and the query.
:param documents: A list of documents.
:param query: The query to be looked for.
:return: None
"""
    v2 = list(query.get('document_vector').values())
for document in documents:
        v1 = list(document.get('document_vector').values())
document['cosine'] = round(cosine(v1, v2), 3)
def print_results(documents: List[Dict]) -> None:
"""
Pretty print the documents.
:param documents: List of documents
:return: None
"""
print('###############################################################')
documents = sorted(documents, key=lambda k: k['cosine'], reverse=True)
top5_documents = documents[:5]
for document in top5_documents:
print('Document: ' + document.get('filename') + ' has a cosine similarity of: ' + str(document.get('cosine')))
print('###############################################################')
if __name__ == '__main__':
"""
This program calculates the cosine-similarity between als documents of argv[1] and the query argv[2].
It reduces the sentences with a the german snowball-stemmer. Then it calculates the index-document-vocabulary by
chaining the BoW of all documents and the query to a set of all stemmed words. Then the document-vectors are calculated
like Chapter 1. Foil 33.
To run the example of A1:
-> Change example to True
run $ python program.py meta_a1.txt "Tasse Kanne"
Else
-> Make sure example is False.
run $ python program.py meta.txt "<query>"
:arg 1: File which contains all documents (Two columns separated with \t and ends with \n)
:arg 2: Query to be looked for (must be surrounded with "")
"""
nltk.download('punkt')
nltk.download('stopwords')
all_documents = FileReader(filename=sys.argv[1]).return_file_as_list_of_documents(skip_first_line=True)
prepare_documents(all_documents)
search_query = {'text': str(sys.argv[2])}
prepare_query(search_query)
index_term_vocabulary = assemble_index_term_vocabulary_from_(all_documents, search_query)
add_document_vectors(all_documents, index_term_vocabulary, 'filtered_stemmed_text')
add_document_vectors([search_query], index_term_vocabulary, 'filtered_stemmed_query')
df_documents = get_df(all_documents, index_term_vocabulary)
add_tf_idf_as_weight(all_documents, df_documents, 'text')
add_tf_idf_as_weight([search_query], df_documents, 'query')
add_cosine_similarity(all_documents, search_query)
    # Debugging aid (requires `import json` if uncommented):
    # print('All Documents: ' + json.dumps(all_documents, indent=3))
    # print('Search Query: ' + json.dumps(search_query, indent=3))
print_results(all_documents)
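
# Example session (hedged; the similarity values depend on the EXAMPLE flag
# and are elided here):
#   $ python program.py meta_a1.txt "Tasse Kanne"
#   ###############################################################
#   Document: <name> has a cosine similarity of: <value>
#   ...
#   ###############################################################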
A standalone tokenisation and stemming demo:

from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('german')
text = "Ich bin der der das hier gerne macht und wir gingen weg"
text_tokens = word_tokenize(text, 'german')
filtered_text = sorted([w for w in text_tokens if w not in stop_words])
filtered_stemmed_text = [SnowballStemmer('german').stem(w) for w in filtered_text]
print(filtered_text)
print(filtered_stemmed_text)
query = "Ich mag das sehr gerne, da ich gerne gehe"
query_tokens = word_tokenize(query)
filtered_query = word_tokenize(query, 'german')
filtered_stemmed_query = sorted([w for w in query_tokens if w not in stop_words])
print(filtered_query)
print(filtered_stemmed_query)
index_term_vocabulary = sorted(set(filtered_stemmed_text + filtered_stemmed_query))
print(index_term_vocabulary)
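# Note (hedged): stopwords.words('german') is all lower-case while the filters
# above compare tokens case-sensitively, so capitalised stopwords such as
# "Ich" survive. Lower-casing first would drop them too, e.g.:
#   filtered_text = sorted(w for w in text_tokens if w.lower() not in stop_words)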
requirements.txt

nltk==3.4
numpy==1.16.2