    from typing import Dict, List
    
    from src import WIKIDATA_ENTRYPOINT, DBPEDIA_ENTRYPOINT
    from src.lib.composer import Composer
    from src.lib.decapper import Decapper
    from src.lib.engine import SPARQLEngine
    from src.lib.statement import Statement
    from src.lib.reader import FileReader
    from src.lib.writer import FileWriter
    from src.lib.keys import ResultKeys
    
    
    def assemble_wikidata_groundtruth_english_advanced() -> Dict:
        """
        This method assembles the groundtruth for the data in wikidata.
        It also writes them to /static/data/wikidata_groundtruth.txt
    
        :return: Groundtruth of wikidata as JSON.
        """
    
        # Title and Description
        print('[GET]: All wikidata data only title and description')
        engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_1.sparql')
        data = engine.get_json()
        print('Done')
    
        # Director and Author
        print('[GET]: All wikidata data only director and author')
        engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_2.sparql')
        data_tmp = engine.get_json()
        composer = Composer(target=data, source=data_tmp)
        composer.add_keys_to_header()
        composer.add_values_to_bindings(keys=[ResultKeys.director.value, ResultKeys.author.value])
        print('Done')
    
        # Cast and Published
        print('[GET]: All wikidata data only cast and published')
        engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_3.sparql')
        data_tmp = engine.get_json()
        composer = Composer(target=data, source=data_tmp)
        composer.add_keys_to_header()
        composer.add_values_to_bindings(keys=[ResultKeys.cast.value, ResultKeys.published.value])
        print('Done')
    
        # Genre, Duration and ProductionCompany
        print('[GET]: All wikidata data only genre, duration and production company')
        engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_4.sparql')
        data_tmp = engine.get_json()
        composer = Composer(target=data, source=data_tmp)
        composer.add_keys_to_header()
        composer.add_values_to_bindings(
            keys=[ResultKeys.genre.value, ResultKeys.duration.value, ResultKeys.production_company.value])
        print('Done')
    
        writer = FileWriter(destination='static/wikidata/wikidata_groundtruth.txt', data=data)
        writer.as_json()
        return data
    
    
    def assemble_wikidata_groundtruth_english() -> Dict:
        """
        This method assembles the groundtruth for the data in wikidata.
        It also writes them to /static/data/wikidata_groundtruth.txt
    
        :return: Groundtruth of wikidata as JSON.
        """
        engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_groundtruth.sparql')
        data = engine.get_json()
        writer = FileWriter(destination='static/wikidata/wikidata_groundtruth.txt', data=data)
        writer.as_json()
        return data
    
    
    def assemble_dbpedia_groundtruth_english() -> Dict:
        """
        This method assembles the groundtruth for the data in dbpedia.
        It also writes them to /static/data/dbpedia_groundtruth.txt
    
        :return: Groundtruth of dbpedia as JSON.
        """
        print('[GET]: All dbpedia data about title and director')
        engine = SPARQLEngine(entrypoint=DBPEDIA_ENTRYPOINT, query='static/dbpedia/dbpedia_1.sparql')
        data = engine.get_json()
        print('Done')
    
        print('[GET]: All dbpedia data only genre and subject')
        engine = SPARQLEngine(entrypoint=DBPEDIA_ENTRYPOINT, query='static/dbpedia/dbpedia_2.sparql')
        data_tmp = engine.get_json()
        print('Done')
    
        composer = Composer(target=data, source=data_tmp)
        composer.add_keys_to_header()
        composer.add_values_to_bindings(keys=[ResultKeys.genre.value, ResultKeys.subject.value])
    
        print('[GET]: The rest')
        engine = SPARQLEngine(entrypoint=DBPEDIA_ENTRYPOINT, query='static/dbpedia/dbpedia_3.sparql')
        data_tmp = engine.get_json()
        print('Done')
    
        composer = Composer(target=data, source=data_tmp)
        composer.add_keys_to_header()
        composer.add_values_to_bindings(
            keys=[ResultKeys.author.value, ResultKeys.cast.value, ResultKeys.published.value, ResultKeys.duration.value,
                  ResultKeys.description.value, ResultKeys.distributor.value, ResultKeys.production_companies.value])
    
        writer = FileWriter(destination='static/dbpedia/dbpedia_groundtruth.txt', data=data)
        writer.as_json()
        return data
    
    
    def assemble_wikidata_triples() -> List:
        """
        This method assembles the N-Triples of wikidata.
    
        :return: List of all triples.
        """
    
        data = FileReader(source='static/wikidata/wikidata_groundtruth.txt').as_json()
        wdt = 'http://www.wikidata.org/prop/direct/'
        schema = 'http://schema.org/'
        rdfs = 'http://www.w3.org/2000/01/rdf-schema#'
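        # Wikidata direct properties used below: P57 director, P58 screenwriter
        # (mapped from the author key), P161 cast member, P577 publication date,
        # P136 genre, P2047 duration, P272 production company.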
        triples: List[str] = []
    
        print('[GET]: All wikidata data about bad films')
        engine = SPARQLEngine(entrypoint=WIKIDATA_ENTRYPOINT, query='static/wikidata/wikidata_bad_films.sparql')
        bad_films = [Decapper(result[ResultKeys.movie.value]).unpack() for result in
                     engine.get_json()['results']['bindings']]
        print('Done')
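        # Films flagged by the bad-films query are skipped when emitting triples.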
    
        for result in data:
            movie = Decapper(result[ResultKeys.movie.value]).unpack()
    
            if movie in bad_films:
                continue
    
            if ResultKeys.title.value in result.keys():
                for title in Decapper(result[ResultKeys.title.value]).unpack().split(ResultKeys.line_separator.value):
                    if title:
                        triples += [Statement(subject=movie,
                                              predicate=rdfs + 'label',
                                              value=title,
                                              value_type='@en').as_etv_triple()]
    
            if ResultKeys.director.value in result.keys():
                for director in Decapper(result[ResultKeys.director.value]).unpack().split(ResultKeys.line_separator.value):
                    if director:
                        triples += [Statement(subject=movie,
                                              predicate=wdt + 'P57',
                                              value=director).as_ete_triple()]
    
            if ResultKeys.author.value in result.keys():
                for author in Decapper(result[ResultKeys.author.value]).unpack().split(ResultKeys.line_separator.value):
                    if author:
                        triples += [Statement(subject=movie,
                                              predicate=wdt + 'P58',
                                              value=author).as_ete_triple()]
    
            if ResultKeys.cast.value in result.keys():
                for cast in Decapper(result[ResultKeys.cast.value]).unpack().split(ResultKeys.line_separator.value):
                    if cast:
                        triples += [Statement(subject=movie,
                                              predicate=wdt + 'P161',
                                              value=cast).as_ete_triple()]
    
            if ResultKeys.published.value in result.keys():
                for published in Decapper(result[ResultKeys.published.value]).unpack().split(
                        ResultKeys.line_separator.value):
                    if published:
                        triples += [Statement(subject=movie,
                                              predicate=wdt + 'P577',
                                              value=published,
                                              value_type='^^xsd:dateTime').as_etv_triple()]

            if ResultKeys.genre.value in result.keys():
                for genre in Decapper(result[ResultKeys.genre.value]).unpack().split(ResultKeys.line_separator.value):
                    if genre:
                        triples += [Statement(subject=movie,
                                              predicate=wdt + 'P136',
                                              value=genre).as_ete_triple()]
    
            if ResultKeys.duration.value in result.keys():
                for duration in Decapper(result[ResultKeys.duration.value]).unpack().split(ResultKeys.line_separator.value):
                    if duration:
                        triples += [Statement(subject=movie,
                                              predicate=wdt + 'P2047',
                                              value=duration,
                                              value_type='^^xsd:decimal').as_etv_triple()]
    
            if ResultKeys.description.value in result.keys():
                for description in Decapper(result[ResultKeys.description.value]).unpack().split(
                        ResultKeys.line_separator.value):
                    if description:
                        triples += [Statement(subject=movie,
                                              predicate=schema + 'description',
                                              value=description,
                                              value_type='@en').as_etv_triple()]
    
            if ResultKeys.production_company.value in result.keys():
                for production_company in Decapper(result[ResultKeys.production_company.value]).unpack().split(
                        ResultKeys.line_separator.value):
                    if production_company:
                        triples += [Statement(subject=movie,
                                              predicate=wdt + 'P272',
                                              value=production_company).as_ete_triple()]
        FileWriter(destination='static/wikidata/wikidata.txt', data=triples).as_string(new_line=True)
        return triples
    
    
    def assemble_dbpedia_triples() -> List:
        """
        This method assembles the N-Triples of dbpedia.
    
        :return: List of all triples.
        """
    
        data = FileReader(source='static/dbpedia/dbpedia_groundtruth.txt').as_json()
        foaf = 'http://xmlns.com/foaf/0.1/'
        dbo = 'http://dbpedia.org/ontology/'
        dct = 'http://purl.org/dc/terms/'
        dbp = 'http://dbpedia.org/property/'
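        # Vocabularies used below: foaf (FOAF), dbo (DBpedia ontology),
        # dct (Dublin Core terms), dbp (raw DBpedia properties).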
    
        triples: List[str] = []
        for result in data:
            movie = Decapper(result[ResultKeys.movie.value]).unpack()
    
            if ResultKeys.title.value in result.keys():
                for title in Decapper(result[ResultKeys.title.value]).unpack().split(ResultKeys.line_separator.value):
                    if title:
                        triples += [Statement(subject=movie,
                                              predicate=foaf + 'name',
                                              value=title,
                                              value_type='@en').as_etv_triple()]
    
            if ResultKeys.director.value in result.keys():
                for director in Decapper(result[ResultKeys.director.value]).unpack().split(ResultKeys.line_separator.value):
                    if director:
                        triples += [Statement(subject=movie,
                                              predicate=dbo + 'director',
                                              value=director).as_ete_triple()]
    
            if ResultKeys.author.value in result.keys():
                for author in Decapper(result[ResultKeys.author.value]).unpack().split(ResultKeys.line_separator.value):
                    if author:
                        triples += [Statement(subject=movie,
                                              predicate=dbo + 'author',
                                              value=author).as_ete_triple()]
    
            if ResultKeys.cast.value in result.keys():
                for cast in Decapper(result[ResultKeys.cast.value]).unpack().split(ResultKeys.line_separator.value):
                    if cast:
                        triples += [Statement(subject=movie,
                                              predicate=dbo + 'starring',
                                              value=cast).as_ete_triple()]
    
            if ResultKeys.published.value in result.keys():
                for published in Decapper(result[ResultKeys.published.value]).unpack().split(
                        ResultKeys.line_separator.value):
                    if published:
                        triples += [Statement(subject=movie,
                                              predicate=dbo + 'releaseDate',
                                              value=published,
                                              value_type='^^xsd:date').as_etv_triple()]
    
            if ResultKeys.subject.value in result.keys():
                for subject in Decapper(result[ResultKeys.subject.value]).unpack().split(ResultKeys.line_separator.value):
                    if subject:
                        triples += [Statement(subject=movie,
                                              predicate=dct + 'subject',
                                              value=subject).as_ete_triple()]
    
            if ResultKeys.genre.value in result.keys():
                for genre in Decapper(result[ResultKeys.genre.value]).unpack().split(ResultKeys.line_separator.value):
                    if genre:
                        triples += [Statement(subject=movie,
                                              predicate=dbo + 'genre',
                                              value=genre).as_ete_triple()]
    
            if ResultKeys.duration.value in result.keys():
                for duration in Decapper(result[ResultKeys.duration.value]).unpack().split(ResultKeys.line_separator.value):
                    if duration:
                        triples += [Statement(subject=movie,
                                              predicate=dbo + 'runtime',
                                              value=duration,
                                              value_type='^^xsd:double').as_etv_triple()]
    
            if ResultKeys.description.value in result.keys():
                for description in Decapper(result[ResultKeys.description.value]).unpack().split(
                        ResultKeys.line_separator.value):
                    if description:
                        triples += [Statement(subject=movie,
                                              predicate=dbo + 'abstract',
                                              value=description,
                                              value_type='@en').as_etv_triple()]
    
            if ResultKeys.distributor.value in result.keys():
                for distributor in Decapper(result[ResultKeys.distributor.value]).unpack().split(
                        ResultKeys.line_separator.value):
                    if distributor:
                        triples += [Statement(subject=movie,
                                              predicate=dbo + 'distributor',
                                              value=distributor).as_ete_triple()]
    
            if ResultKeys.production_companies.value in result.keys():
                for production_companies in Decapper(result[ResultKeys.production_companies.value]).unpack().split(
                        ResultKeys.line_separator.value):
                    if production_companies:
                        triples += [Statement(subject=movie,
                                              predicate=dbp + 'productionCompanies',
                                              value=production_companies).as_ete_triple()]
        FileWriter(destination='static/dbpedia/dbpedia.txt', data=triples).as_string(new_line=True)
        return triples
    
    
    def assemble_similarity_triples() -> List:
        """
        This method assembles the similarity between the wikidata and the dbpedia datase.
        This will be done in two instances.
    
        First level:
            -> Take a look the owl:sameAs
        Second level:
            -> Take a look if two instances have the same name and the same director/directors
    
        :return: None
        """
        dbpedia_data = FileReader(source='static/dbpedia/dbpedia_groundtruth.txt').as_json()
        wikidata_data = FileReader(source='static/wikidata/wikidata_groundtruth.txt').as_json()
    
        triples = []
        for wiki in wikidata_data:
            wiki_movie = Decapper(wiki[ResultKeys.movie.value]).unpack()
            for dbpedia in dbpedia_data:
                if wiki_movie == Decapper(dbpedia[ResultKeys.movie_link.value]).unpack():
                    triples += [Statement(subject=wiki_movie,
                                          predicate='http://www.w3.org/2002/07/owl#sameAs',
                                          value=Decapper(dbpedia[ResultKeys.movie.value]).unpack()
                                          ).as_ete_triple()
                                ]
                    break
    
                if Decapper(wiki[ResultKeys.title.value]).unpack() == Decapper(dbpedia[ResultKeys.title.value]).unpack():
                    wiki_set = set(
                        Decapper(wiki[ResultKeys.director.value]).unpack().split(ResultKeys.line_separator.value))
                    dbpedia_set = set(
                        Decapper(dbpedia[ResultKeys.director_link.value]).unpack().split(ResultKeys.line_separator.value))
                    if len(wiki_set & dbpedia_set) > 0:
                        triples += [Statement(subject=wiki_movie,
                                              predicate='http://www.w3.org/2002/07/owl#sameAs',
                                              value=Decapper(dbpedia[ResultKeys.movie.value]).unpack()
                                              ).as_ete_triple()
                                    ]
                        break
        FileWriter(destination='static/coreferences/coreferences.txt', data=triples).as_string(new_line=True)
        return triples
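

    if __name__ == '__main__':
        # Hypothetical driver, not part of the original module: the assumed
        # pipeline order is ground-truth assembly, then triple generation, then
        # coreference linking. assemble_wikidata_groundtruth_english() is the
        # single-query alternative to the advanced variant.
        assemble_wikidata_groundtruth_english_advanced()
        assemble_dbpedia_groundtruth_english()
        assemble_wikidata_triples()
        assemble_dbpedia_triples()
        assemble_similarity_triples()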