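"""
This module assembles the movie groundtruth from the Wikidata and DBpedia
SPARQL endpoints, serializes it as N-Triples, and links the two datasets
with owl:sameAs coreference triples.
"""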
from typing import Dict, List
from src import WIKIDATA_ENTRYPOINT, DBPEDIA_ENTRYPOINT
from src.lib.composer import Composer
from src.lib.decapper import Decapper
from src.lib.engine import SPARQLEngine
from src.lib.statement import Statement
from src.lib.reader import FileReader
from src.lib.writer import FileWriter
from src.lib.keys import ResultKeys


def assemble_wikidata_groundtruth_english_advanced() -> Dict:
    """
    This method assembles the groundtruth for the data in Wikidata from
    several partial SPARQL queries and merges the results.
    It also writes the result to static/wikidata/wikidata_groundtruth.txt.
    :return: Groundtruth of Wikidata as JSON.
    """
# Title and Description
print('[GET]: All wikidata data only title and description')
engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_1.sparql')
data = engine.get_json()
print('Done')
# Director and Author
print('[GET]: All wikidata data only director and author')
engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_2.sparql')
data_tmp = engine.get_json()
Composer(target=data, source=data_tmp).add_keys_to_header()
Composer(target=data, source=data_tmp).add_values_to_bindings(
keys=[ResultKeys.director.value, ResultKeys.author.value])
print("Done")
# Cast and Published
print('[GET]: All wikidata data only cast and published')
engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_3.sparql')
data_tmp = engine.get_json()
Composer(target=data, source=data_tmp).add_keys_to_header()
Composer(target=data, source=data_tmp).add_values_to_bindings(
keys=[ResultKeys.cast.value, ResultKeys.published.value])
print('Done')
# Genre, Duration and ProductionCompany
print('[GET]: All wikidata data only genre, duration and production company')
engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_4.sparql')
data_tmp = engine.get_json()
Composer(target=data, source=data_tmp).add_keys_to_header()
Composer(target=data, source=data_tmp).add_values_to_bindings(
keys=[ResultKeys.genre.value, ResultKeys.duration.value, ResultKeys.production_company.value])
print('Done')
writer = FileWriter(destination='static/wikidata/wikidata_groundtruth.txt', data=data)
writer.as_json()
return data


def assemble_wikidata_groundtruth_english() -> Dict:
    """
    This method assembles the groundtruth for the data in Wikidata with a
    single SPARQL query.
    It also writes the result to static/wikidata/wikidata_groundtruth.txt.
    :return: Groundtruth of Wikidata as JSON.
    """
engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_groundtruth.sparql')
data = engine.get_json()
writer = FileWriter(destination='static/wikidata/wikidata_groundtruth.txt', data=data)
writer.as_json()
return data


def assemble_dbpedia_groundtruth_english() -> Dict:
    """
    This method assembles the groundtruth for the data in DBpedia from
    several partial SPARQL queries and merges the results.
    It also writes the result to static/dbpedia/dbpedia_groundtruth.txt.
    :return: Groundtruth of DBpedia as JSON.
    """
print('[GET]: All dbpedia data about title and director')
engine = SPARQLEngine(entrypoint=DBPEDIA_ENTRYPOINT, query='static/dbpedia/dbpedia_1.sparql')
data = engine.get_json()
print('Done')
print('[GET]: All dbpedia data only genre and subject')
engine = SPARQLEngine(entrypoint=DBPEDIA_ENTRYPOINT, query='static/dbpedia/dbpedia_2.sparql')
data_tmp = engine.get_json()
print('Done')
Composer(target=data, source=data_tmp).add_keys_to_header()
Composer(target=data, source=data_tmp).add_values_to_bindings(
keys=[ResultKeys.genre.value, ResultKeys.subject.value])
    print('[GET]: All dbpedia data for the remaining fields')
engine = SPARQLEngine(entrypoint=DBPEDIA_ENTRYPOINT, query='static/dbpedia/dbpedia_3.sparql')
data_tmp = engine.get_json()
print('Done')
Composer(target=data, source=data_tmp).add_keys_to_header()
Composer(target=data, source=data_tmp).add_values_to_bindings(
keys=[ResultKeys.author.value, ResultKeys.cast.value, ResultKeys.published.value, ResultKeys.duration.value,
ResultKeys.description.value, ResultKeys.distributor.value, ResultKeys.production_companies.value])
writer = FileWriter(destination='static/dbpedia/dbpedia_groundtruth.txt', data=data)
writer.as_json()
return data


def assemble_wikidata_triples() -> List:
    """
    This method assembles the N-Triples of Wikidata, skipping films that a
    separate query flags as bad, and writes them to static/wikidata/wikidata.txt.
    :return: List of all triples.
    """
data = FileReader(source='static/wikidata/wikidata_groundtruth.txt').as_json()
wdt = 'http://www.wikidata.org/prop/direct/'
schema = 'http://schema.org/'
rdfs = 'http://www.w3.org/2000/01/rdf-schema#'
    triples: List[str] = []
print('[GET]: All wikidata data about bad films')
engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_bad_films.sparql')
bad_films = [Decapper(result[ResultKeys.movie.value]).unpack() for result in
engine.get_json()['results']['bindings']]
print('Done')
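    # One binding per movie; multi-valued fields arrive as a single string
    # joined with ResultKeys.line_separator (presumably built via GROUP_CONCAT
    # in the SPARQL queries), so each field is split and one triple is emitted
    # per non-empty value. as_etv_triple() presumably renders a literal object
    # ("value"@en or "value"^^xsd:...), as_ete_triple() an entity object (<uri>).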
for result in data:
movie = Decapper(result[ResultKeys.movie.value]).unpack()
if movie in bad_films:
continue
if ResultKeys.title.value in result.keys():
for title in Decapper(result[ResultKeys.title.value]).unpack().split(ResultKeys.line_separator.value):
if title:
triples += [Statement(subject=movie,
predicate=rdfs + 'label',
value=title,
value_type='@en').as_etv_triple()]
if ResultKeys.director.value in result.keys():
for director in Decapper(result[ResultKeys.director.value]).unpack().split(ResultKeys.line_separator.value):
if director:
triples += [Statement(subject=movie,
predicate=wdt + 'P57',
value=director).as_ete_triple()]
if ResultKeys.author.value in result.keys():
for author in Decapper(result[ResultKeys.author.value]).unpack().split(ResultKeys.line_separator.value):
if author:
triples += [Statement(subject=movie,
predicate=wdt + 'P58',
value=author).as_ete_triple()]
if ResultKeys.cast.value in result.keys():
for cast in Decapper(result[ResultKeys.cast.value]).unpack().split(ResultKeys.line_separator.value):
if cast:
triples += [Statement(subject=movie,
predicate=wdt + 'P161',
value=cast).as_ete_triple()]
if ResultKeys.published.value in result.keys():
for published in Decapper(result[ResultKeys.published.value]).unpack().split(
ResultKeys.line_separator.value):
if published:
triples += [Statement(subject=movie,
predicate=wdt + 'P577',
value=published,
value_type='^^xsd:dateTime').as_etv_triple()]
if ResultKeys.genre.value in result.keys():
for genre in Decapper(result[ResultKeys.genre.value]).unpack().split(ResultKeys.line_separator.value):
if genre:
triples += [Statement(subject=movie,
predicate=wdt + 'P136',
value=genre).as_ete_triple()]
if ResultKeys.duration.value in result.keys():
for duration in Decapper(result[ResultKeys.duration.value]).unpack().split(ResultKeys.line_separator.value):
if duration:
triples += [Statement(subject=movie,
predicate=wdt + 'P2047',
value=duration,
value_type='^^xsd:decimal').as_etv_triple()]
if ResultKeys.description.value in result.keys():
for description in Decapper(result[ResultKeys.description.value]).unpack().split(
ResultKeys.line_separator.value):
if description:
triples += [Statement(subject=movie,
predicate=schema + 'description',
value=description,
value_type='@en').as_etv_triple()]
if ResultKeys.production_company.value in result.keys():
for production_company in Decapper(result[ResultKeys.production_company.value]).unpack().split(
ResultKeys.line_separator.value):
if production_company:
triples += [Statement(subject=movie,
predicate=wdt + 'P272',
value=production_company).as_ete_triple()]
FileWriter(destination='static/wikidata/wikidata.txt', data=triples).as_string(new_line=True)
return triples


def assemble_dbpedia_triples() -> List:
    """
    This method assembles the N-Triples of DBpedia and writes them to
    static/dbpedia/dbpedia.txt.
    :return: List of all triples.
    """
data = FileReader(source='static/dbpedia/dbpedia_groundtruth.txt').as_json()
foaf = 'http://xmlns.com/foaf/0.1/'
dbo = 'http://dbpedia.org/ontology/'
dct = 'http://purl.org/dc/terms/'
dbp = 'http://dbpedia.org/property/'
    triples: List[str] = []
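    # Same per-field pattern as assemble_wikidata_triples(), mapped to the
    # DBpedia, Dublin Core, and FOAF vocabularies.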
for result in data:
movie = Decapper(result[ResultKeys.movie.value]).unpack()
if ResultKeys.title.value in result.keys():
for title in Decapper(result[ResultKeys.title.value]).unpack().split(ResultKeys.line_separator.value):
if title:
triples += [Statement(subject=movie,
predicate=foaf + 'name',
value=title,
value_type='@en').as_etv_triple()]
if ResultKeys.director.value in result.keys():
for director in Decapper(result[ResultKeys.director.value]).unpack().split(ResultKeys.line_separator.value):
if director:
triples += [Statement(subject=movie,
predicate=dbo + 'director',
value=director).as_ete_triple()]
if ResultKeys.author.value in result.keys():
for author in Decapper(result[ResultKeys.author.value]).unpack().split(ResultKeys.line_separator.value):
if author:
triples += [Statement(subject=movie,
predicate=dbo + 'author',
value=author).as_ete_triple()]
if ResultKeys.cast.value in result.keys():
for cast in Decapper(result[ResultKeys.cast.value]).unpack().split(ResultKeys.line_separator.value):
if cast:
triples += [Statement(subject=movie,
predicate=dbo + 'starring',
value=cast).as_ete_triple()]
if ResultKeys.published.value in result.keys():
for published in Decapper(result[ResultKeys.published.value]).unpack().split(
ResultKeys.line_separator.value):
if published:
triples += [Statement(subject=movie,
predicate=dbo + 'releaseDate',
value=published,
value_type='^^xsd:date').as_etv_triple()]
if ResultKeys.subject.value in result.keys():
for subject in Decapper(result[ResultKeys.subject.value]).unpack().split(ResultKeys.line_separator.value):
if subject:
triples += [Statement(subject=movie,
predicate=dct + 'subject',
value=subject).as_ete_triple()]
if ResultKeys.genre.value in result.keys():
for genre in Decapper(result[ResultKeys.genre.value]).unpack().split(ResultKeys.line_separator.value):
if genre:
triples += [Statement(subject=movie,
predicate=dbo + 'genre',
value=genre).as_ete_triple()]
if ResultKeys.duration.value in result.keys():
for duration in Decapper(result[ResultKeys.duration.value]).unpack().split(ResultKeys.line_separator.value):
if duration:
triples += [Statement(subject=movie,
predicate=dbo + 'runtime',
value=duration,
value_type='^^xsd:double').as_etv_triple()]
if ResultKeys.description.value in result.keys():
for description in Decapper(result[ResultKeys.description.value]).unpack().split(
ResultKeys.line_separator.value):
if description:
triples += [Statement(subject=movie,
predicate=dbo + 'abstract',
value=description,
value_type='@en').as_etv_triple()]
if ResultKeys.distributor.value in result.keys():
for distributor in Decapper(result[ResultKeys.distributor.value]).unpack().split(
ResultKeys.line_separator.value):
if distributor:
triples += [Statement(subject=movie,
predicate=dbo + 'distributor',
value=distributor).as_ete_triple()]
if ResultKeys.production_companies.value in result.keys():
for production_companies in Decapper(result[ResultKeys.production_companies.value]).unpack().split(
ResultKeys.line_separator.value):
if production_companies:
triples += [Statement(subject=movie,
predicate=dbp + 'productionCompanies',
value=production_companies).as_ete_triple()]
FileWriter(destination='static/dbpedia/dbpedia.txt', data=triples).as_string(new_line=True)
return triples


def assemble_similarity_triples() -> List:
    """
    This method assembles owl:sameAs links between the Wikidata and the
    DBpedia datasets. It also writes them to static/coreferences/coreferences.txt.
    Matching is done on two levels:
    First level:
        -> Take a look at the owl:sameAs link stored in the DBpedia results.
    Second level:
        -> Take a look at whether two instances have the same title and share
           at least one director.
    :return: List of all owl:sameAs triples.
    """
dbpedia_data = FileReader(source='static/dbpedia/dbpedia_groundtruth.txt').as_json()
wikidata_data = FileReader(source='static/wikidata/wikidata_groundtruth.txt').as_json()
triples = []
for wiki in wikidata_data:
wiki_movie = Decapper(wiki[ResultKeys.movie.value]).unpack()
for dbpedia in dbpedia_data:
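            # Level 1: the DBpedia record links back to this Wikidata entity
            # through its owl:sameAs reference (movie_link), when present.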
            if (ResultKeys.movie_link.value in dbpedia
                    and wiki_movie == Decapper(dbpedia[ResultKeys.movie_link.value]).unpack()):
triples += [Statement(subject=wiki_movie,
predicate='http://www.w3.org/2002/07/owl#sameAs',
value=Decapper(dbpedia[ResultKeys.movie.value]).unpack()
).as_ete_triple()
]
break
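            # Level 2: identical title plus at least one shared director.
            # Key presence is checked first, mirroring the guards used in the
            # triple assemblers above.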
            if (ResultKeys.title.value in wiki and ResultKeys.title.value in dbpedia
                    and ResultKeys.director.value in wiki
                    and ResultKeys.director_link.value in dbpedia
                    and Decapper(wiki[ResultKeys.title.value]).unpack()
                    == Decapper(dbpedia[ResultKeys.title.value]).unpack()):
                wiki_set = set(
                    Decapper(wiki[ResultKeys.director.value]).unpack().split(ResultKeys.line_separator.value))
                dbpedia_set = set(
                    Decapper(dbpedia[ResultKeys.director_link.value]).unpack().split(ResultKeys.line_separator.value))
                if wiki_set & dbpedia_set:
triples += [Statement(subject=wiki_movie,
predicate='http://www.w3.org/2002/07/owl#sameAs',
value=Decapper(dbpedia[ResultKeys.movie.value]).unpack()
).as_ete_triple()
]
break
FileWriter(destination='static/coreferences/coreferences.txt', data=triples).as_string(new_line=True)
return triples
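

if __name__ == '__main__':
    # Minimal driver sketch, assuming the natural pipeline order: fetch both
    # groundtruth files first, then derive the N-Triples and the owl:sameAs
    # coreferences from them. assemble_wikidata_groundtruth_english() is the
    # single-query alternative to the advanced variant.
    assemble_wikidata_groundtruth_english_advanced()
    assemble_dbpedia_groundtruth_english()
    assemble_wikidata_triples()
    assemble_dbpedia_triples()
    assemble_similarity_triples()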