Skip to content
Snippets Groups Projects
Select Git revision
  • 2b75217bf8bef7737632efee1ebb1ddbd8cfefe6
  • master default protected
  • exec_auto_adjust_trace
  • let_variables
  • v1.4.1
  • v1.4.0
  • v1.3.0
  • v1.2.0
  • v1.1.0
  • v1.0.0
10 results

build.gradle

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    dbpedia_analyser.py 10.53 KiB
    from tokenize import String
    from typing import Callable, Dict
    
    from src import DBPEDIA_ENTRYPOINT
    from src.lib.decapper import Decapper
    from src.lib.engine import SPARQLEngine
    from src.lib.keys import ResultKeys
    
    
    def __get_all_films_with_offset(offset: int = 0) -> str:
        """
        Build the query that lists every resource typed as dbo:Film in DBPedia.

        The query text carries no LIMIT clause; only the value appended after the
        OFFSET keyword changes between calls.

        :param offset: Number of rows to skip; appended verbatim after OFFSET.
        :return: The query
        """
        return """
                PREFIX dbo: <http://dbpedia.org/ontology/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    
                SELECT DISTINCT (?movie AS ?Movie)
                WHERE {
                    ?movie rdf:type dbo:Film
                } 
                ORDER BY DESC(?movie)
                OFFSET 
            """ + str(offset)
    
    
    def __get_all_comedy_films_with_offset(offset: int = 0) -> str:
        """
        Build the query that lists every DBPedia film having a subject matching "comedy".

        A film qualifies when any of its dct:subject values matches the
        case-insensitive regular expression "comedy".

        :param offset: Number of rows to skip; appended verbatim after OFFSET.
        :return: The query
        """
        # dct: is declared explicitly so the query does not depend on prefixes the
        # endpoint happens to predefine; str() turns the subject IRI into a string
        # because REGEX is only defined on string literals.
        return """
                PREFIX dbo: <http://dbpedia.org/ontology/>
                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    
                SELECT DISTINCT (?movie AS ?Movie)
                WHERE {
                    ?movie a <http://dbpedia.org/ontology/Film> .
                    ?movie dct:subject ?subject.
                    FILTER( regex(str(?subject), "comedy","i" ))
                } 
                ORDER BY DESC(?movie)
                OFFSET 
            """ + str(offset)
    
    
    def __get_all_comedy_films_with_directors_birthday_offset(offset: int = 0) -> str:
        """
        Build the query that lists every DBPedia comedy film whose director has a birth date.

        A film qualifies when one of its dct:subject values matches "comedy"
        (case-insensitive) and at least one dbo:director has a dbo:birthDate.

        :param offset: Number of rows to skip; appended verbatim after OFFSET.
        :return: The query
        """
        # dct: is declared explicitly so the query does not depend on prefixes the
        # endpoint happens to predefine; str() turns the subject IRI into a string
        # because REGEX is only defined on string literals.
        return """
                PREFIX dbo: <http://dbpedia.org/ontology/>
                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    
                SELECT DISTINCT (?movie AS ?Movie)
                WHERE {
                    ?movie a <http://dbpedia.org/ontology/Film> .
                    ?movie dct:subject ?subject.
                    FILTER( regex(str(?subject), "comedy","i" ))
                    ?movie dbo:director [dbo:birthDate ?dob].
                } 
                ORDER BY DESC(?movie)
                OFFSET 
            """ + str(offset)
    
    
    def __get_all_comedy_films_with_directors_birthday_before_1970_offset(offset: int = 0) -> str:
        """
        Build the query that lists every DBPedia comedy film with a director born before 1970.

        A film qualifies when one of its dct:subject values matches "comedy"
        (case-insensitive) and at least one dbo:director has a dbo:birthDate whose
        leading four characters, read as an integer year, are below 1970.

        :param offset: Number of rows to skip; appended verbatim after OFFSET.
        :return: The query
        """
        # dct: and xsd: are declared explicitly so the query does not depend on
        # prefixes the endpoint happens to predefine. SUBSTR is 1-based in SPARQL,
        # so the four-digit year starts at index 1 (index 0 yields only three
        # characters on a standards-compliant engine).
        return """
                PREFIX dbo: <http://dbpedia.org/ontology/>
                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    
                SELECT DISTINCT (?movie AS ?Movie)
                WHERE {
                    ?movie a <http://dbpedia.org/ontology/Film> .
                    ?movie dct:subject ?subject.
                    FILTER( regex(str(?subject), "comedy","i" ))
                    ?movie dbo:director [dbo:birthDate ?dob].
                    FILTER (xsd:integer(substr(xsd:string(?dob), 1, 4)) < 1970).
                }  
                ORDER BY DESC(?movie)
                OFFSET 
            """ + str(offset)
    
    
    def __get_all_comedy_films_with_directors_birthday_after_or_on_1970_offset(offset: int = 0) -> str:
        """
        Build the query that lists every DBPedia comedy film with a director born in 1970 or later.

        A film qualifies when one of its dct:subject values matches "comedy"
        (case-insensitive) and at least one dbo:director has a dbo:birthDate whose
        leading four characters, read as an integer year, are 1970 or greater.

        :param offset: Number of rows to skip; appended verbatim after OFFSET.
        :return: The query
        """
        # dct: and xsd: are declared explicitly so the query does not depend on
        # prefixes the endpoint happens to predefine. SUBSTR is 1-based in SPARQL,
        # so the four-digit year starts at index 1 (index 0 yields only three
        # characters on a standards-compliant engine).
        return """
                PREFIX dbo: <http://dbpedia.org/ontology/>
                PREFIX dct: <http://purl.org/dc/terms/>
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    
                SELECT DISTINCT (?movie AS ?Movie)
                WHERE {
                    ?movie a <http://dbpedia.org/ontology/Film> .
                    ?movie dct:subject ?subject.
                    FILTER( regex(str(?subject), "comedy","i" ))
                    ?movie dbo:director [dbo:birthDate ?dob].
                    FILTER (xsd:integer(substr(xsd:string(?dob), 1, 4)) >= 1970).
                }  
                ORDER BY DESC(?movie)
                OFFSET 
            """ + str(offset)
    
    
    def __get_interlinks_from_dbpedia_to_wikidata() -> str:
        """
        Build the query that counts interlinks from DBPedia movie data to Wikidata.

        The outer SELECT returns a single row of distinct counts:
        ?Movie (movies), ?MovieLink (owl:sameAs targets of movies containing
        'wikidata.org'), ?Title (English foaf:name values), ?Director (directors),
        ?DirectorLink (owl:sameAs targets of directors containing 'wikidata.org')
        and ?DirectorName (English rdfs:label values of directors).

        The nested sub-selects pick the movies to count: films matched as comedy via
        dct:subject or dbo:genre that have directors with a birth date. The HAVING
        clauses appear intended to keep only movies whose directors share one birth
        year that is 1970 or later -- NOTE(review): confirm against the endpoint.

        :return: The query
        """
        # owl: and rdfs: are declared explicitly (they are used in the OPTIONAL
        # blocks) so the query does not depend on prefixes the endpoint happens to
        # predefine. SUBSTR is 1-based in SPARQL, so the year starts at index 1.
        return """
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX dbc: <http://dbpedia.org/resource/Category:>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dct: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX dbp:	<http://dbpedia.org/property/>
    
        SELECT
        (COUNT(DISTINCT ?movie) AS ?Movie)
        (COUNT(DISTINCT ?same) AS ?MovieLink)
        (COUNT(DISTINCT ?name) AS ?Title)
        (COUNT(DISTINCT ?director) AS ?Director)
        (COUNT(DISTINCT ?sameDirector) AS ?DirectorLink)
        (COUNT(DISTINCT ?directorName) AS ?DirectorName)
        WHERE {
           {
            SELECT DISTINCT ?movie
            WHERE {
                SELECT DISTINCT ?movie ?min_year ?max_year
                WHERE {
                    SELECT DISTINCT ?movie ?director (MIN(?year) AS ?min_year) (MAX(?year) AS ?max_year)
                    WHERE{
                            # X type Film
                            {?movie rdf:type dbo:Film.}
    
                            # Imagine a inner join
    
                            # X subject Comedy
                            {?movie dct:subject dbc:Comedy}
                            UNION
                            # X subject Y; Y like "Comedy".
                            {?movie dct:subject ?y FILTER CONTAINS(lcase(str(?y)), "comedy").}
                            UNION
                            # X genre Y; Y like "Comedy".
                            {?movie dbo:genre ?y FILTER CONTAINS (lcase(str(?y)), "comedy").}
                            # X director  Person; Person birth date year
                            {?movie dbo:director ?director.
                            ?director dbo:birthDate ?dob.
                            BIND(xsd:integer(substr(xsd:string(?dob), 1, 4)) as ?year) .
                            }
                    } GROUP BY ?movie ?director
                }GROUP BY ?movie HAVING (?min_year = ?max_year)
            }GROUP BY ?movie HAVING (?min_year >= 1970)
         }
        OPTIONAL{?movie foaf:name ?name FILTER(LANG(?name)="en").}
        OPTIONAL{
            ?movie dbo:director ?director.
            OPTIONAL{
                ?director owl:sameAs ?sameDirector FILTER CONTAINS (LCASE(STR(?sameDirector)), 'wikidata.org').
            }
        }
        OPTIONAL{
            {?movie dbo:director ?director.
            ?director rdfs:label ?directorName FILTER(LANG(?directorName)="en").}
        }
        OPTIONAL{?movie owl:sameAs ?same FILTER CONTAINS (LCASE(STR(?same)), 'wikidata.org').}
    }
        """
    
    
    def get_statistics(entrypoint: str, query: Callable[[int], str], info: str) -> Dict:
        """
        Count the distinct movies returned by a paged query.

        The query builder is called with the offsets 0, 10000, 20000, ... and each
        resulting query is sent to the entrypoint until a page comes back with no
        bindings. The unpacked movie values of all pages are de-duplicated.

        :param entrypoint: Entrypoint to use.
        :param query: Callable taking an offset and returning the query text for that page.
        :param info: Label describing the statistic; returned unchanged in the result.
        :return: Dict with the keys 'info', 'entrypoint' and 'independent'
                 (the number of distinct movies found).
        """
        # Distance between two consecutive pages. The queries carry no LIMIT, so this
        # presumably mirrors the row cap applied by the endpoint -- TODO confirm.
        page_step = 10000

        distinct_movies = set()
        offset = 0
        while True:
            engine = SPARQLEngine(entrypoint=entrypoint, query=query(offset))
            bindings = engine.get_json_with_query()['results']['bindings']
            if not bindings:
                # An empty page marks the end of the result set.
                break

            distinct_movies.update(
                Decapper(binding[ResultKeys.movie.value]).unpack() for binding in bindings
            )
            offset += page_step

        return {
            'info': info,
            'entrypoint': entrypoint,
            'independent': len(distinct_movies)
        }
    
    
    def get_interlinks(entrypoint: str, query: str, info: str) -> Dict:
        """
        Run the interlink query and return its counts as integers.

        Only the first binding row of the response is read; each count is unpacked
        with Decapper and converted with int().

        :param entrypoint: Entrypoint to use.
        :param query: The query text to execute.
        :param info: Label describing the statistic; returned unchanged in the result.
        :return: Dict with the keys 'info', 'entrypoint', 'movies', 'moviesLinked',
                 'moviesNames', 'directors', 'directorsLinked' and 'directorsNames'.
        :raises IndexError: If the response contains no binding row.
        """
        response = SPARQLEngine(entrypoint=entrypoint, query=query).get_json_with_query()
        row = response['results']['bindings'][0]

        # Output key -> binding name in the result row, in the order they are reported.
        count_fields = (
            ('movies', ResultKeys.movie.value),
            ('moviesLinked', ResultKeys.movie_link.value),
            ('moviesNames', ResultKeys.title.value),
            ('directors', ResultKeys.director.value),
            ('directorsLinked', ResultKeys.director_link.value),
            ('directorsNames', 'DirectorName'),
        )

        summary = {'info': info, 'entrypoint': entrypoint}
        for output_key, binding_name in count_fields:
            summary[output_key] = int(Decapper(row[binding_name]).unpack())
        return summary
    
    
    def get_all_statistics() -> Dict:
        """
        Collect every DBPedia statistic into one dictionary.

        Each paged movie query is evaluated through get_statistics and stored under
        its label; the interlink summary from get_interlinks is added last.

        :return: Dict mapping each statistic label to its result dict.
        """
        # Label -> query builder; the label doubles as the 'info' text of the result.
        paged_queries = (
            ('all_films_in_dbpedia',
             __get_all_films_with_offset),
            ('all_comedy_films_in_dbpedia',
             __get_all_comedy_films_with_offset),
            ('all_comedy_films_in_dbpedia_with_a_director_who_has_a_birthday',
             __get_all_comedy_films_with_directors_birthday_offset),
            ('all_comedy_films_in_dbpedia_with_a_director_who_has_a_birthday_before_1970',
             __get_all_comedy_films_with_directors_birthday_before_1970_offset),
            ('all_comedy_films_in_dbpedia_with_a_director_who_has_a_birthday_after_or_on_1970',
             __get_all_comedy_films_with_directors_birthday_after_or_on_1970_offset),
        )

        report = {}
        for label, build_query in paged_queries:
            report[label] = get_statistics(DBPEDIA_ENTRYPOINT, build_query, label)

        interlink_label = 'interlinks_from_dbpedia_to_wikidata'
        interlink_query = __get_interlinks_from_dbpedia_to_wikidata()
        report[interlink_label] = get_interlinks(DBPEDIA_ENTRYPOINT, interlink_query, interlink_label)
        return report